
# append raw embeddings and patent_id data generated from gender_postprocess_embed.py
import numpy as np
import os
import pandas as pd
import logging

# Configure logging for the whole script: INFO level and above, each record
# prefixed with a timestamp, the level name and the logger name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# NOTE: this runs at import time. Every relative path used further down
# (e.g. "patentsberta/...") is resolved against this directory, and os.chdir
# raises OSError if the directory does not exist (e.g. drive not mounted).
os.chdir("/Volumes/Zihao_SSD2/PatentsView/")


def append_embeddings(input_dir, out_dir, save: bool = True) -> np.ndarray:
    """
    Load the chunked patent embedding files and stack them into one matrix.

    Reads ``embedding_{start}_{end}.npz.npy`` from ``input_dir`` for the chunks
    0-50, 50-100, ..., 200-250 plus the shorter final chunk 250-291, stacks
    them row-wise and drops the first row of the stacked result.

    Args:
        input_dir: directory containing the chunked embedding files
        out_dir: directory the stacked matrix is written to; only used when
            ``save`` is True, so it may be None otherwise
        save: if True, write the matrix to
            ``<out_dir>/patentsberta_embedding_matrix.npy``
    Return:
        npm: embedding matrix [n_patents, n_dim]
    """
    # (start, end) labels of the chunk files; the last chunk ends at 291, not 300
    chunk_bounds = [(start, min(start + 50, 291)) for start in range(0, 300, 50)]
    chunks = [
        np.load(os.path.join(input_dir, f"embedding_{start}_{end}.npz.npy"))
        for start, end in chunk_bounds
    ]
    npm = np.vstack(chunks)
    # vstack copied the data, so the per-chunk arrays can be released now
    del chunks

    # Drop the first row. append_patent_id drops the first row of the id table
    # as well ("first row which is wrong"), which keeps row indices aligned.
    npm = npm[1:, :]
    logger.info(npm.shape)

    if save:
        # np.save appends ".npy" because the file name does not end with it
        out_path = os.path.join(out_dir, "patentsberta_embedding_matrix")
        np.save(out_path, npm)
        logger.info("embedding matrix saved.")

    return npm


def append_patent_id(out_dir=None):
    """
    create patent dictionary pd.DataFrame for [idx, patent_id, patent_year]
    where idx corresponds to row in patent embedding matrix.

    Steps:
      1. concatenate the chunked ``patentsberta/patent_id/patent_id_*.csv``
         files and drop the first row
      2. read ``patent_id`` / ``patent_year`` from ``patentsberta/patent_raw.csv``
         and keep only rows whose patent_id is purely numeric
      3. restrict the id table to the row range spanned by those numeric ids
         and attach ``patent_year`` by merging on ``patent_id``
      4. repair years so that ``patent_year`` never decreases down the table

    The rows keep the order of the chunk files (nothing is sorted here); the
    code assumes that order is ascending in patent_id, so that patent_year is
    non-decreasing after the repair in step 4. An isolated row whose year is
    lower than the previous row's year is set to the previous row's year; if
    the column is still not monotonic afterwards an AssertionError is raised.

    Args:
        out_dir: if given (truthy), the table is also written to
            ``<out_dir>/patent_dict.csv``
    Return:
        df_out: DataFrame with ``idx``, ``patent_id`` and ``patent_year``
            (int16), plus any other columns present in the patent_id chunk files
    """
    # (start, end) labels of the chunk files; the last chunk ends at 291, not 300
    chunk_bounds = [(start, min(start + 50, 291)) for start in range(0, 300, 50)]
    df_list = [
        pd.read_csv(f"patentsberta/patent_id/patent_id_{start}_{end}.csv")
        for start, end in chunk_bounds
    ]
    df_out = pd.concat(df_list, ignore_index=True)
    del df_list
    # remove first row which is wrong and reset index
    df_out = df_out.loc[1:, :].reset_index(drop=True)

    df_raw = pd.read_csv(
        "patentsberta/patent_raw.csv",
        usecols=["patent_id", "patent_year"],
        dtype={"patent_id": str, "patent_year": "int16"},
    )
    # .copy() gives an independent frame, so the column assignment below does
    # not trigger pandas' SettingWithCopyWarning. The original row labels of
    # df_raw are kept on purpose: they are used to slice df_out further down.
    df_raw2 = df_raw[df_raw["patent_id"].str.isnumeric()].copy()
    df_raw2["patent_id"] = df_raw2["patent_id"].astype(int)

    del df_raw
    logger.info(
        "keep only nuemric patent_ids and assume they are monotonically increasing with year for easiness of computation"
    )
    min_idx = df_raw2.index.min()
    max_idx = df_raw2.index.max()
    logger.info(f"min index: {min_idx}, max index {max_idx} inclusive")

    # Label-based slice, so max_idx is included. This relies on row i of the id
    # table describing the same patent as row i of patent_raw.csv.
    df_out = df_out.loc[min_idx:max_idx, :]
    # keep the pre-slice row label as a column: it is the embedding-matrix row
    df_out = df_out.reset_index()
    df_out["patent_id"] = df_out["patent_id"].astype(int)
    df_out.rename(columns={"index": "idx"}, inplace=True)
    df_out = df_out.merge(df_raw2, on=["patent_id"])

    # fix patents with wrong year and make patent_id monotonic increasing with year
    df_out.loc[df_out["patent_id"] == 4687998, "patent_year"] = 1987
    df_out.loc[df_out["patent_id"] == 4687999, "patent_year"] = 1987

    # Wherever a row's year is lower than the previous row's year, replace it
    # with the previous row's year. The first row has no predecessor (NaN), and
    # a comparison against NaN is False, so it is never touched.
    prev_year = df_out["patent_year"].shift(1)
    dips = df_out["patent_year"] < prev_year
    df_out["patent_year"] = df_out["patent_year"].mask(dips, prev_year)

    # Series.is_monotonic was removed in pandas 2.0; is_monotonic_increasing is
    # the equivalent property (non-decreasing check).
    assert df_out[
        "patent_year"
    ].is_monotonic_increasing, "patent_year is not monotonically increasing"
    df_out["patent_year"] = df_out["patent_year"].astype("int16")

    if out_dir:
        out_path = os.path.join(out_dir, "patent_dict.csv")
        df_out.to_csv(out_path, index=False)
        logger.info("patent dictionary saved!")
    return df_out


if __name__ == "__main__":
    # Both steps read their inputs from, and write their outputs to, the same
    # folder (relative to the working directory set at the top of the file).
    data_dir = "patentsberta/"

    # 1) stack the chunked embeddings and save the full matrix
    append_embeddings(data_dir, data_dir, save=True)
    # 2) build and save the idx / patent_id / patent_year lookup table
    append_patent_id(out_dir=data_dir)
